#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Time    :   2021/04/19 10:35:12
@Author  :   Leo Wood 
@Contact :   leowood@foxmail.com
'''

import re

# texts = []
# with open('/Users/leo/Data/项目数据/文德数慧-文本内容审核/分类实验/数据/文德语料_黄/youma.txt','r') as f:
#     for line in f.readlines():
#         line = line.strip()
#         line = line.replace('中字-','').replace('精选极品高颜值美乳正妹女优啪啪啪','')
#         line = re.sub(r'[0-9]+', '', line)
#         line = line.split('-')[0]
#         texts.append(line)
# texts = set(texts)
# with open('/Users/leo/Data/项目数据/文德数慧-文本内容审核/分类实验/数据/文德语料_黄_挑选/youma.txt','w') as f:
#     for t in texts:
#         t = t.strip()
#         if len(t)>=5:
#             f.write(t + '\n')

# texts = []
# with open('/Users/leo/Data/项目数据/文德数慧-文本内容审核/分类实验/数据/文德语料_黄/wuma.txt','r') as f:
#     for line in f.readlines():
#         line = line.strip()
#         line = line.replace('精选-','').replace('无码流出','').replace('模型集合-','')
#         line = re.sub(r'[0-9]+', '', line)
#         line = line.split('-')[0]
#         texts.append(line)
# texts = set(texts)
# with open('/Users/leo/Data/项目数据/文德数慧-文本内容审核/分类实验/数据/文德语料_黄_挑选/wuma.txt','w') as f:
#     for t in texts:
#         t = t.strip()
#         if len(t)>=5:
#             f.write(t + '\n')
        
# texts = []
# with open('/Users/leo/Data/项目数据/文德数慧-文本内容审核/分类实验/数据/文德语料_黄/oumei.txt','r') as f:
#     for line in f.readlines():
#         line = line.strip()
#         line = line.replace('第一视角-','').replace('精选-','')
#         line = re.sub(r'[0-9]+', '', line)
#         texts.append(line)
# texts = set(texts)
# with open('/Users/leo/Data/项目数据/文德数慧-文本内容审核/分类实验/数据/文德语料_黄_挑选/oumei.txt','w') as f:
#     for t in texts:
#         t = t.strip()
#         if len(t)>=5:
#             f.write(t + '\n')

# Clean the raw guochan.txt title list: remove known tag substrings from each
# line, de-duplicate the results, and write every entry that is at least
# 5 characters long to the selected-corpus file, one entry per line.

# Tags are removed in exactly this order. Order matters because some tags
# overlap ('JVID-' has to be removed before the bare 'JVID'). '精选-' appears
# twice on purpose: the second pass removes an occurrence that only comes
# into existence after the first pass deleted text from the middle of it.
tags = ('国产AV剧情-', '精选-', '精选-', 'JVID-', 'JVID', 'swag', '国产AV-')

texts = set()
# The encoding is given explicitly because the text handled here contains
# non-ASCII characters and the platform default encoding is not always UTF-8.
# NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
with open('/Users/leo/Data/项目数据/文德数慧-文本内容审核/分类实验/数据/文德语料_黄/guochan.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        for tag in tags:
            line = line.replace(tag, '')
        # Digit removal is disabled for this input file:
        # line = re.sub(r'[0-9]+', '', line)
        # Strip again BEFORE de-duplicating: removing a tag can expose
        # leading/trailing whitespace, and entries that differ only by that
        # whitespace would otherwise be written out as duplicate lines.
        texts.add(line.strip())

with open('/Users/leo/Data/项目数据/文德数慧-文本内容审核/分类实验/数据/文德语料_黄_挑选/guochan.txt', 'w', encoding='utf-8') as f:
    # Sorted so that the output file is identical from run to run; plain set
    # iteration order for strings is not stable across interpreter runs.
    for t in sorted(texts):
        if len(t) >= 5:
            f.write(t + '\n')